# Run this code when you restart the machine
# Fill in with YOUR name and NIM
import datetime
import uuid

# Student identity plus session metadata for the assignment header.
myName = "Christopher Darren"
myNIM = "00000054804"
myDate = datetime.datetime.now()
myDevice = str(uuid.uuid1())

# Print each header field with its label template.
for template, value in (
    ("Name: \t\t{}", myName),
    ("NIM: \t\t{}", myNIM),
    ("Start: \t\t{}", myDate),
    ("Device ID: \t{}", myDevice),
):
    print(template.format(value))
Name: Christopher Darren NIM: 00000054804 Start: 2023-05-04 08:08:58.986644 Device ID: 3fdc3dcd-ea18-11ed-87cb-f02f74a116e8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NBA player statistics, one row per player-season (local file path --
# presumably the Kaggle "NBA Players" all_seasons dataset; confirm source).
df = pd.read_csv(r"D:\SEMESTER 4\IF540 Machine Learning\LAB\week10\all_seasons.csv")
# Expect 12,305 rows x 22 columns (see output below).
df.shape
(12305, 22)
df.head()
| Unnamed: 0 | player_name | team_abbreviation | age | player_height | player_weight | college | country | draft_year | draft_round | ... | pts | reb | ast | net_rating | oreb_pct | dreb_pct | usg_pct | ts_pct | ast_pct | season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Dennis Rodman | CHI | 36.0 | 198.12 | 99.790240 | Southeastern Oklahoma State | USA | 1986 | 2 | ... | 5.7 | 16.1 | 3.1 | 16.1 | 0.186 | 0.323 | 0.100 | 0.479 | 0.113 | 1996-97 |
| 1 | 1 | Dwayne Schintzius | LAC | 28.0 | 215.90 | 117.933920 | Florida | USA | 1990 | 1 | ... | 2.3 | 1.5 | 0.3 | 12.3 | 0.078 | 0.151 | 0.175 | 0.430 | 0.048 | 1996-97 |
| 2 | 2 | Earl Cureton | TOR | 39.0 | 205.74 | 95.254320 | Detroit Mercy | USA | 1979 | 3 | ... | 0.8 | 1.0 | 0.4 | -2.1 | 0.105 | 0.102 | 0.103 | 0.376 | 0.148 | 1996-97 |
| 3 | 3 | Ed O'Bannon | DAL | 24.0 | 203.20 | 100.697424 | UCLA | USA | 1995 | 1 | ... | 3.7 | 2.3 | 0.6 | -8.7 | 0.060 | 0.149 | 0.167 | 0.399 | 0.077 | 1996-97 |
| 4 | 4 | Ed Pinckney | MIA | 34.0 | 205.74 | 108.862080 | Villanova | USA | 1985 | 1 | ... | 2.4 | 2.4 | 0.2 | -11.2 | 0.109 | 0.179 | 0.127 | 0.611 | 0.040 | 1996-97 |
5 rows × 22 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12305 entries, 0 to 12304 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 12305 non-null int64 1 player_name 12305 non-null object 2 team_abbreviation 12305 non-null object 3 age 12305 non-null float64 4 player_height 12305 non-null float64 5 player_weight 12305 non-null float64 6 college 12305 non-null object 7 country 12305 non-null object 8 draft_year 12305 non-null object 9 draft_round 12305 non-null object 10 draft_number 12305 non-null object 11 gp 12305 non-null int64 12 pts 12305 non-null float64 13 reb 12305 non-null float64 14 ast 12305 non-null float64 15 net_rating 12305 non-null float64 16 oreb_pct 12305 non-null float64 17 dreb_pct 12305 non-null float64 18 usg_pct 12305 non-null float64 19 ts_pct 12305 non-null float64 20 ast_pct 12305 non-null float64 21 season 12305 non-null object dtypes: float64(12), int64(2), object(8) memory usage: 2.1+ MB
df.describe()
| Unnamed: 0 | age | player_height | player_weight | gp | pts | reb | ast | net_rating | oreb_pct | dreb_pct | usg_pct | ts_pct | ast_pct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 | 12305.000000 |
| mean | 6152.000000 | 27.084518 | 200.611602 | 100.369926 | 51.290532 | 8.172775 | 3.559155 | 1.813986 | -2.255733 | 0.054473 | 0.141014 | 0.184891 | 0.511060 | 0.131358 |
| std | 3552.291866 | 4.335868 | 9.146321 | 12.477150 | 25.095909 | 5.974957 | 2.483550 | 1.794155 | 12.673254 | 0.043599 | 0.062704 | 0.053390 | 0.101163 | 0.094483 |
| min | 0.000000 | 18.000000 | 160.020000 | 60.327736 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | -250.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 3076.000000 | 24.000000 | 193.040000 | 90.718400 | 31.000000 | 3.600000 | 1.800000 | 0.600000 | -6.400000 | 0.021000 | 0.096000 | 0.149000 | 0.480000 | 0.066000 |
| 50% | 6152.000000 | 26.000000 | 200.660000 | 99.790240 | 57.000000 | 6.700000 | 3.000000 | 1.200000 | -1.300000 | 0.041000 | 0.131000 | 0.181000 | 0.524000 | 0.103000 |
| 75% | 9228.000000 | 30.000000 | 208.280000 | 108.862080 | 73.000000 | 11.500000 | 4.700000 | 2.400000 | 3.200000 | 0.084000 | 0.180000 | 0.217000 | 0.561000 | 0.178000 |
| max | 12304.000000 | 44.000000 | 231.140000 | 163.293120 | 85.000000 | 36.100000 | 16.300000 | 11.700000 | 300.000000 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 1.000000 |
df.isnull().sum()
Unnamed: 0 0 player_name 0 team_abbreviation 0 age 0 player_height 0 player_weight 0 college 0 country 0 draft_year 0 draft_round 0 draft_number 0 gp 0 pts 0 reb 0 ast 0 net_rating 0 oreb_pct 0 dreb_pct 0 usg_pct 0 ts_pct 0 ast_pct 0 season 0 dtype: int64
# Full numeric candidate list, kept for reference:
#numerical = ['age','player_height','player_weight','draft_number','gp','pts','reb','ast','net_rating','oreb_pct','dreb_pct',
#             'usg_pct','ts_pct','ast_pct']
# Feature groups used below: numeric columns get box/dist plots,
# categorical columns get count plots, unused columns are dropped.
numerical = ['age','player_height','player_weight','pts']
categorical = ['team_abbreviation','country','draft_year','draft_round']
# Columns with no clustering signal: row index, identifiers, season label.
unused = ['Unnamed: 0','player_name','college','season']
df = df.drop(columns = unused)
# 4 columns dropped: 22 -> 18.
df.shape
(12305, 18)
# Impute any missing categorical values with the column's most
# frequent value (no-op here: the isnull check above showed no NaNs).
for col in categorical:
    most_common = df[col].mode().iloc[0]
    df[col] = df[col].fillna(most_common)

# Confirm no nulls remain after imputation.
df.isnull().sum()
team_abbreviation 0 age 0 player_height 0 player_weight 0 country 0 draft_year 0 draft_round 0 draft_number 0 gp 0 pts 0 reb 0 ast 0 net_rating 0 oreb_pct 0 dreb_pct 0 usg_pct 0 ts_pct 0 ast_pct 0 dtype: int64
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x18da9957640>
# One countplot per categorical feature, laid out on a 3x2 grid.
fig = plt.figure(figsize=(25, 25))
for position, col in enumerate(categorical, start=1):
    fig.add_subplot(3, 2, position)
    sns.countplot(data=df, x=col)
    plt.xticks(rotation=30)
plt.show()
# Pairwise Pearson correlation of the numeric columns only.
# numeric_only=True keeps this working on pandas >= 2.0, where
# DataFrame.corr() raises on object columns instead of dropping them
# silently (the result is identical on older pandas).
corr = df.corr(method='pearson', numeric_only=True)
# Boolean mask hiding the upper triangle (k=1 leaves the diagonal
# visible, matching the original tril-based masking), so each
# correlation pair is drawn exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(15, 12))
fig.set_size_inches(15, 15)
sns.heatmap(corr, mask=mask, vmax=0.9, square=True, annot=True, ax=ax)
<AxesSubplot:>
# Keep only the four features selected for clustering
# (other candidates such as usg_pct, reb, ast, net_rating were
# considered but left out of this run).
cluster_features = ['age', 'player_height', 'player_weight', 'pts']
df_cluster = df[cluster_features].copy()
df_cluster.head()
| age | player_height | player_weight | pts | |
|---|---|---|---|---|
| 0 | 36.0 | 198.12 | 99.790240 | 5.7 |
| 1 | 28.0 | 215.90 | 117.933920 | 2.3 |
| 2 | 39.0 | 205.74 | 95.254320 | 0.8 |
| 3 | 24.0 | 203.20 | 100.697424 | 3.7 |
| 4 | 34.0 | 205.74 | 108.862080 | 2.4 |
# One boxplot per numeric feature on a 2x2 grid to inspect outliers.
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(numerical, start=1):
    fig.add_subplot(2, 2, position)
    sns.boxplot(data=df, x=col)
plt.show()
# Distribution of each clustering feature.  sns.histplot(kde=True,
# stat="density") replaces the deprecated sns.distplot (removed in
# seaborn 0.14) and reproduces its density-scaled histogram + KDE.
# The stray plt.legend() is dropped: no artists carry labels here,
# so it only emitted a "No artists with labels" warning.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(8, 8))
sns.histplot(df["age"], kde=True, stat="density", ax=ax1)
sns.histplot(df["player_height"], kde=True, stat="density", ax=ax2)
sns.histplot(df["player_weight"], kde=True, stat="density", ax=ax3)
sns.histplot(df["pts"], kde=True, stat="density", ax=ax4)
plt.tight_layout()
C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
<matplotlib.legend.Legend at 0x18db79a9bb0>
# Log-transform the right-skewed features to make them more symmetric.
# 'pts' is excluded: it contains zeros, so log would yield -inf.
df_cluster_log = np.log(df_cluster[['age','player_height','player_weight']])

# histplot(kde=True, stat="density") replaces the deprecated
# sns.distplot (removed in seaborn 0.14) with equivalent output.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 8))
sns.histplot(df_cluster_log["age"], kde=True, stat="density", ax=ax1)
sns.histplot(df_cluster_log["player_height"], kde=True, stat="density", ax=ax2)
sns.histplot(df_cluster_log["player_weight"], kde=True, stat="density", ax=ax3)
plt.tight_layout()
C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\Darren\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
df_cluster_log.head()
| age | player_height | player_weight | |
|---|---|---|---|
| 0 | 3.583519 | 5.288873 | 4.603070 |
| 1 | 3.332205 | 5.374815 | 4.770124 |
| 2 | 3.663562 | 5.326613 | 4.556550 |
| 3 | 3.178054 | 5.314191 | 4.612120 |
| 4 | 3.526361 | 5.326613 | 4.690082 |
from sklearn.preprocessing import StandardScaler

# Standardize the log features to zero mean / unit variance so Ward
# linkage (Euclidean-based) is not dominated by any single scale.
scaler = StandardScaler()
cluster_scaled = scaler.fit_transform(df_cluster_log)
import scipy.cluster.hierarchy as sch
plt.figure(figsize=(20,10))
# Ward-linkage dendrogram; the largest vertical gaps suggest 3 clusters.
dendrogram = sch.dendrogram(sch.linkage(cluster_scaled, method='ward'))
from sklearn.cluster import AgglomerativeClustering
from mpl_toolkits.mplot3d import Axes3D

# Bottom-up agglomerative clustering with the 3 clusters suggested by
# the dendrogram above (default linkage is 'ward').
model = AgglomerativeClustering(n_clusters=3)
model.fit(cluster_scaled)
hac_labels = model.labels_
# 3-D scatter of the original (untransformed) features, colored by
# the cluster label assigned to each player-season.
fig = plt.figure(num=None, figsize=(15,20), dpi=80, facecolor='w', edgecolor='k')
ax = plt.axes(projection="3d")
ax.scatter3D(df_cluster['age'],df_cluster['player_height'],df_cluster['player_weight'],c=hac_labels,cmap='rainbow')
xLabel = ax.set_xlabel('age', linespacing=3.2)
yLabel = ax.set_ylabel('player_height', linespacing=3.1)
zLabel = ax.set_zlabel('player_weight', linespacing=3.4)
print("Hierarchical Agglomerative Clustering")
Hierarchical Agglomerative Clustering
# Per-cluster feature means (rounded) to characterize the 3 groups.
df_clustered_hac = df_cluster.assign(Cluster=hac_labels)
grouped_hac = df_clustered_hac.groupby(['Cluster']).mean().round(1)
grouped_hac
| age | player_height | player_weight | pts | |
|---|---|---|---|---|
| Cluster | ||||
| 0 | 25.9 | 192.2 | 89.7 | 8.4 |
| 1 | 30.6 | 205.4 | 107.1 | 8.3 |
| 2 | 23.3 | 208.2 | 108.9 | 7.5 |
# The dendrogram using the breast-cancer dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the breast-cancer dataset (local file path -- presumably the
# Wisconsin Diagnostic Breast Cancer CSV; confirm source).
data = pd.read_csv(r'D:\SEMESTER 4\IF540 Machine Learning\LAB\week10\data.csv')
data.head()
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 Unnamed: 32 0 non-null float64 dtypes: float64(31), int64(1), object(1) memory usage: 146.8+ KB
data.shape
(569, 33)
#encoding
def encode_data(feature_name, frame=None):
    """Build a {category_value: integer_index} mapping for one column.

    Each unique value in the column is assigned its first-seen position,
    so the mapping can be used with Series.replace/map to turn
    categorical data into integers.

    Parameters
    ----------
    feature_name : str
        Name of the column whose unique values should be enumerated.
    frame : pandas.DataFrame, optional
        DataFrame to read from.  Defaults to the module-level ``data``.
        (Bug fix: the original body referenced an undefined name
        ``dataLogistic``, so any call raised NameError.)

    Returns
    -------
    dict
        Mapping of unique column value -> integer index.
    """
    if frame is None:
        frame = data  # module-level DataFrame loaded above
    return {value: idx for idx, value in enumerate(frame[feature_name].unique())}
# Map diagnosis labels to integers (malignant 'M' -> 0, benign 'B' -> 1).
# Assigning the result back replaces the chained inplace=True pattern,
# which is deprecated in pandas >= 2.1 and may not modify the frame.
data['diagnosis'] = data['diagnosis'].replace({'M': 0, 'B': 1})
# Per-column null counts: only 'Unnamed: 32' is entirely missing (569 NaNs).
data.isnull().sum()
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 Unnamed: 32 569 dtype: int64
# Drop the all-NaN trailing column and the non-predictive id column.
cleandata = data.drop(['Unnamed: 32','id'], axis=1)
cleandata
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | ... | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 0 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | ... | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 0 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | ... | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 0 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | ... | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 0 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | ... | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 0 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | ... | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
| 565 | 0 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | ... | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
| 566 | 0 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | ... | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
| 567 | 0 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | ... | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
| 568 | 1 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | ... | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 31 columns
from sklearn.preprocessing import normalize
# Row-wise L2 normalization (each sample scaled to unit norm); the
# result is a plain ndarray, so rebuild a DataFrame with the original
# column names for label-based access below.
data_scaled=normalize(cleandata)
data_scaled=pd.DataFrame(data_scaled, columns=cleandata.columns)
data_scaled.head()
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.007925 | 0.004573 | 0.054099 | 0.440986 | 0.000052 | 0.000122 | 0.000132 | 0.000065 | 0.000107 | ... | 0.011181 | 0.007635 | 0.081325 | 0.889462 | 0.000071 | 0.000293 | 0.000314 | 0.000117 | 0.000203 | 0.000052 |
| 1 | 0.0 | 0.008666 | 0.007486 | 0.055988 | 0.558619 | 0.000036 | 0.000033 | 0.000037 | 0.000030 | 0.000076 | ... | 0.010528 | 0.009862 | 0.066899 | 0.824026 | 0.000052 | 0.000079 | 0.000102 | 0.000078 | 0.000116 | 0.000038 |
| 2 | 0.0 | 0.009367 | 0.010109 | 0.061842 | 0.572276 | 0.000052 | 0.000076 | 0.000094 | 0.000061 | 0.000098 | ... | 0.011212 | 0.012145 | 0.072545 | 0.812984 | 0.000069 | 0.000202 | 0.000214 | 0.000116 | 0.000172 | 0.000042 |
| 3 | 0.0 | 0.016325 | 0.029133 | 0.110899 | 0.551922 | 0.000204 | 0.000406 | 0.000345 | 0.000150 | 0.000371 | ... | 0.021314 | 0.037881 | 0.141333 | 0.811515 | 0.000300 | 0.001238 | 0.000982 | 0.000368 | 0.000949 | 0.000247 |
| 4 | 0.0 | 0.009883 | 0.006985 | 0.065808 | 0.631774 | 0.000049 | 0.000065 | 0.000096 | 0.000051 | 0.000088 | ... | 0.010979 | 0.008120 | 0.074137 | 0.767189 | 0.000067 | 0.000100 | 0.000195 | 0.000079 | 0.000115 | 0.000037 |
5 rows × 31 columns
import scipy.cluster.hierarchy as shc

# Ward-linkage tree over the normalized samples, computed once and
# reused for both figures (the original recomputed this O(n^2)
# linkage for each dendrogram).
linkage_matrix = shc.linkage(data_scaled, method='ward')

plt.figure(figsize=(10,7))
plt.title("dendrograms")
dend = shc.dendrogram(linkage_matrix)

# Same dendrogram again with a horizontal cut at distance 1; the line
# crosses two vertical branches, suggesting 2 clusters.
plt.figure(figsize=(10,7))
plt.title("Dendrograms")
dend = shc.dendrogram(linkage_matrix)
plt.axhline(y=1, color='r', linestyle='--')
#plt.ylim(bottom=0, top=18)  # adjust the y-axis limits
plt.show()
from sklearn.cluster import AgglomerativeClustering

# Two clusters, matching the dendrogram cut above.  The 'affinity'
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# 'ward' linkage always uses Euclidean distance (the default), so
# omitting it is behavior-identical on every sklearn version.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster.fit_predict(data_scaled)
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
dtype=int64)
# Scatter of two 'worst' features colored by the 2 predicted clusters.
plt.figure(figsize=(10,7))
plt.scatter(data_scaled['texture_worst'], data_scaled['area_worst'], c=cluster.labels_)
<matplotlib.collections.PathCollection at 0x18dcaecec40>
Berikan simpulan yang dilakukan dari hasil kerja menggunakan algoritma dan 2 dataset yang dipilih. Simpulan bisa berkisar antara (bisa di modifikasi):
- 2 Dataset yang saya gunakan pada week ini 2 2 nya berisi data numerikal dan kategorikal sehingga bisa mendukung proses pengolahan data dari awal hingga selesai. Kemudian pada dataset breastcancer terdapat missing data sehingga saya perlu drop terlebih dahulu sebelum melakukan proses pengolahan data.
- Hierarki clustering merupakan sebuah metode pengelompokan (clustering) data yang berdasarkan pada struktur hierarki atau bertingkat. Metode ini mengelompokkan data ke dalam suatu struktur berupa pohon yang disebut dendrogram, di mana setiap simpul pada dendrogram merepresentasikan suatu cluster atau kelompok data.
- Hasil run dari dendrogram untuk dataset NBA menyimpulkan bahwa jumlah cluster yang dihasilkan ada n=3 cluster, sedangkan untuk run dendrogram untuk dataset breastcancer berjumlah 2 cluster.
- Pada dataset breastcancer tepatnya column 'diagnosis' yang rownya berisi data kategorikal yakni 'M' dan 'B' saya perlu ubah terlebih dahulu menggunakan encoding dengan cara mengubah alias M dan B menjadi 0 dan 1, karena pada proses berikutnya memerlukan data numerik dan tidak boleh ada Char maupun object pada setiap COLUMN breastcancer yakni proses normalisasi data.
# Footer: certify authorship and record the completion timestamp.
myDate = datetime.datetime.now()
for line in (
    "I certify that this is my own work.",
    "Signed by:",
    "Name: \t\t{}".format(myName),
    "NIM: \t\t{}".format(myNIM),
    "Time-stamp:\t{}".format(myDate),
):
    print(line)
I certify that this is my own work. Signed by: Name: Christopher Darren NIM: 00000054804 Time-stamp: 2023-05-04 19:55:07.329487
!jupyter nbconvert --to html "./IF540_Kelas_EL_00000054804_Christopher Darren_Week10.ipynb" --output-dir="./"
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
[NbConvertApp] WARNING | pattern './IF540_Kelas_EL_00000054804_Christopher Darren_Week10.ipynb' matched no files
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
overwrite base name use for output files.
can only be used when converting one notebook at a time.
Default: ''
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.